1 Sandbox

# Toy data: four observations (rows a-d), four features each.
a <- c(2, 4, 4, 6)
b <- c(5, 5, 7, 8)
# NOTE(review): `c` shadows base::c() here; function lookup still resolves
# c(...) calls correctly, but a different name would be safer.
c <- c(9, 9, 9, 8)
d <- c(1, 2, 3, 3)
#row bind four vectors into matrix (row names come from the variable names)
mat <- rbind(a, b, c, d)

#view matrix
mat

# Pairwise distance matrices under three metrics.
dist(mat, method = "euclidean")
dist(mat, method = "manhattan")
# BUG FIX: with no explicit `p`, minkowski defaults to p = 2, which is
# exactly the euclidean distance; p = 3 actually demonstrates a distinct metric.
dist(mat, method = "minkowski", p = 3)

# generate matrices
# Smoke-test the project's synthetic-data generators (presumably defined in
# uniq_functions.R — TODO confirm): random matrices from several distributions
# plus constant matrices.
set.seed(50)
unitib = gen_random_matrix('uniform', 1000, 10)  # 1000 rows x 10 cols, uniform
normtib = gen_random_matrix('normal', 8, 3)      # small normal matrix
# two clusters; group_split presumably controls the group-size ratio — verify
gtib = gen_random_matrix('2groups', 20, 3, group_split = .2)
zerotib = gen_const_matrix(8,3,0)                # all zeros
onetib = gen_const_matrix(8,3,1)                 # all ones

# Descriptive stats of unique pairwise distances on three-cluster data.
gtib = gen_random_matrix('3groups', 100, 5)
ugtib = dist_uniq(gtib, 'euclidean')
desc_stats_uniq(ugtib)

# Same for two clusters in two dimensions (gtib is overwritten).
gtib = gen_random_matrix('2groups', 100, 2)
ugtib = dist_uniq(gtib, 'euclidean')
print(desc_stats_uniq(ugtib))

# Uniqueness stats on unclustered normal data, for comparison.
tib = gen_random_matrix('normal', 1000, 5)
ugtib = dist_uniq(tib, 'euclidean')
print(desc_stats_uniq(ugtib))

# NOTE(review): the first two assignments below are overwritten before use —
# only the 'manhattan' result reaches desc_stats_uniq.
ugtib = dist_uniq(tib, 'euclidean')
ugtib = dist_uniq(tib, 'mahalanobis')
ugtib = dist_uniq(tib, 'manhattan')
desc_stats_uniq(ugtib)

# High-dimensional case (100 rows x 100 cols): the sample covariance matrix
# has rank < 100 and is therefore singular.
cov(gen_random_matrix('2groups',100,100))
ugtib = dist_uniq(gen_random_matrix('2groups',100,100), 'mahalanobis')
tib = gen_random_matrix('2groups',100,100)
# tol=1e-20 forces solve() to accept the singular covariance matrix;
# the resulting distances are numerically unstable — treat with caution.
maha_sum_dists = mahalanobis(tib, colMeans(tib), cov(tib), tol=1e-20)

# test uniqueness
# Work on the uniform random matrix generated earlier.
tib <- unitib

# Disabled branch kept for experimentation: optional 0-1 rescaling of
# numeric variables before computing distances.
if (FALSE) {  # was `if (F)`: F/T are reassignable variables, FALSE/TRUE are not
  # scale variables 0 1
  #tib = tib %>% mutate_if(is.numeric, range01)
  
}
#cos = tcosine(tib)

#mat = gen_random_matrix('normal', 1000, 100)
#bigdist(as.matrix(mat), 'analysis/dist_matrix.tmp', method='euclidean')
#dist_uniq()

#utib$sum_dists_div = sum_dists / nrow(utib)
#utib$sum_dists_log = log10(utib$sum_dists+1)
#utib$sum_dists_sqrt = sqrt(utib$sum_dists)
#n = 3
#utib$sum_dists_nrt = utib$sum_dists ^ (1 / n)
#utib$uniq_lin = 1/(1+utib$sum_dists)
#utib$uniq_log = 1/(1+utib$sum_dists_log)
#utib$uniq_sqrt = 1/(1+utib$sum_dists_sqrt)
#utib$sum_cos = rowSums(cos)
#utib$sum_cos_pos = utib$sum_cos-min(utib$sum_cos)
#utib$sum_cos_log = log10(utib$sum_cos+1)

#View(round(utib,4))
#summary(utib)

# plot histograms
#tib_long <- utib %>% pivot_longer(colnames(utib)) %>% as_tibble()
#gp1 <- ggplot(tib_long, aes(x = value)) +    # Draw each column as histogram
  #geom_histogram(bins = 10) + 
#  geom_density(adjust = 1/2) + 
#  facet_wrap(~ name, scales = "free_x")
#print(gp1)

# characteristics of uniq to be saved for diagnostic
#x = scale(utib$sum_dists)

#desc_stats(x)

2 Outlier detection

2.1 Univariate OD

This is similar to outlier detection in the sense of detecting extreme values, rather than measurement errors. For a single variable, there are well-known methods, e.g., boxplots, n StDev, etc.

“Enderlein (1987) goes even further as the author considers outliers as values that deviate so much from other observations one might suppose a different underlying sampling mechanism. … it sometimes makes sense to formally distinguish two classes of outliers: (i) extreme values and (ii) mistakes.” https://statsandr.com/blog/outliers-detection-in-r

2.2 Multivariate OD

In a multi-variate context, the outliers we are interested in are rare combinations of values. Individually, the values of each variable might not be extreme, but the combination appears relatively far from others.

“The adjusted quantile plot function of the mvoutlier package will solve for and order the squared Mahalanobis Distances for the given observations and plot them against the empirical Chi-Squared distribution function of these values.” https://rpubs.com/Treegonaut/301942

# use mvoutlier
indf = as.data.frame(gen_random_matrix('2groups', 20, 3))

outliers <- aq.plot(indf, delta=qchisq(0.975, df = ncol(indf)))

2.3 Mahalanobis distance

The Mahalanobis distance is often used for OD in multivariate data.

“In multivariate space, the Mahalanobis distance is the distance between two points. It’s frequently used to locate outliers in statistical investigations involving several variables.” Source: https://www.r-bloggers.com/2021/08/how-to-calculate-mahalanobis-distance-in-r/

# Random two-cluster data: 20 observations, 4 variables. The printout below
# shows two groups of rows at very different magnitudes.
intib = gen_random_matrix('2groups', 20, 4)
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
intib
##               V1           V2           V3            V4
## 1      1.1641803   -0.1948692   -0.4252416   -0.22249308
## 2    424.9662196  266.5979769 -101.5830758  120.47922412
## 3  -1819.0693641 1951.0394591 -140.3411740  216.32643610
## 4      0.1291146    0.1794147    0.7936816    0.55376173
## 5      0.5753677   -0.2868437    0.5837242   -0.43088498
## 6     -0.4859220    0.6387479   -0.9638295    0.06068727
## 7      0.2567009    0.6207820   -0.4172149    0.37015321
## 8     -0.7100911    1.3510034   -0.4412846    0.01016097
## 9     -1.6257058    1.5630141   -0.7880726   -0.95895454
## 10    -1.2711655    1.1788008    0.9085291    1.23311558
## 11     0.1919319   -1.0570429    0.5355723    0.98882670
## 12    -0.4938220    0.4070899   -0.4872650   -0.42588735
## 13    -1.6795307    1.8772010   -0.4288508    0.40217567
## 14    -0.6717679   -0.2808324   -0.2731496    1.64284754
## 15     0.5810927    0.1421171   -0.5369906   -0.23387121
## 16    -0.1177264   -1.8474959    0.6532794    1.04005465
## 17   408.4206719   99.9786286 1757.5022732 2498.60880008
## 18  -488.8363871 1247.0478823 1830.2949026  -25.89555311
## 19     0.4493133    0.5170706    1.1878005   -0.17358978
## 20     0.3674473    0.6878928    0.7081729   -0.78370560
# mahalanobis(x, center, cov):
# x: input data
# center: indicate the mean vector of the distribution
# cov: indicate the covariance matrix of the distribution
k <- ncol(intib)  # number of variables, captured BEFORE adding derived columns
intib$maha_dist = mahalanobis(intib, colMeans(intib), cov(intib))

# The p-value comes from the Chi-Square distribution of the squared
# Mahalanobis distance with df = k, the number of variables.
# BUG FIX: the original used df = ncol(intib) AFTER appending maha_dist,
# silently inflating the degrees of freedom by one.
intib$maha_pvalue <- pchisq(intib$maha_dist, df = k, lower.tail = FALSE)
#View(intib)
# show the raw Mahalanobis distances
intib$maha_dist
##  [1]  0.2356693 18.0496287 18.0499380  0.2372853  0.2383430  0.2367667
##  [7]  0.2347773  0.2350168  0.2373395  0.2371455  0.2414191  0.2382379
## [13]  0.2355562  0.2401074  0.2360051  0.2454820 18.0499686 18.0499539
## [19]  0.2357943  0.2355655
intib
##               V1           V2           V3            V4  maha_dist maha_pvalue
## 1      1.1641803   -0.1948692   -0.4252416   -0.22249308  0.2356693 0.998681165
## 2    424.9662196  266.5979769 -101.5830758  120.47922412 18.0496287 0.002884845
## 3  -1819.0693641 1951.0394591 -140.3411740  216.32643610 18.0499380 0.002884465
## 4      0.1291146    0.1794147    0.7936816    0.55376173  0.2372853 0.998659209
## 5      0.5753677   -0.2868437    0.5837242   -0.43088498  0.2383430 0.998644726
## 6     -0.4859220    0.6387479   -0.9638295    0.06068727  0.2367667 0.998666277
## 7      0.2567009    0.6207820   -0.4172149    0.37015321  0.2347773 0.998693196
## 8     -0.7100911    1.3510034   -0.4412846    0.01016097  0.2350168 0.998689972
## 9     -1.6257058    1.5630141   -0.7880726   -0.95895454  0.2373395 0.998658469
## 10    -1.2711655    1.1788008    0.9085291    1.23311558  0.2371455 0.998661116
## 11     0.1919319   -1.0570429    0.5355723    0.98882670  0.2414191 0.998602097
## 12    -0.4938220    0.4070899   -0.4872650   -0.42588735  0.2382379 0.998646169
## 13    -1.6795307    1.8772010   -0.4288508    0.40217567  0.2355562 0.998682694
## 14    -0.6717679   -0.2808324   -0.2731496    1.64284754  0.2401074 0.998620367
## 15     0.5810927    0.1421171   -0.5369906   -0.23387121  0.2360051 0.998676620
## 16    -0.1177264   -1.8474959    0.6532794    1.04005465  0.2454820 0.998544636
## 17   408.4206719   99.9786286 1757.5022732 2498.60880008 18.0499686 0.002884428
## 18  -488.8363871 1247.0478823 1830.2949026  -25.89555311 18.0499539 0.002884446
## 19     0.4493133    0.5170706    1.1878005   -0.17358978  0.2357943 0.998679475
## 20     0.3674473    0.6878928    0.7081729   -0.78370560  0.2355655 0.998682569

2.4 Sum of distances

This shows the behaviour of the sum of distances on clustered data. The visualisation shows the distances of each point to all other points.

# Gaussian mixture: 3 clusters with random 2-D centres and random sizes.
gauss_mix = generate_gaussian_mix(3, # number of clusters
  list(sample(1:10, 2), sample(1:10, 2), sample(1:20, 2)), # centres
  sample(1:40, 3)) # sizes

# calculate sum of all distances for each point
# diag/upper keep the full symmetric matrix so rowSums covers all pairs
gauss_mix$dist_sum = dist(gauss_mix, diag = T, upper = T) %>% as.matrix() %>% rowSums()
gauss_mix$dist_sum_z = zscore(gauss_mix$dist_sum)
# plot results

# scatter plot: colour/size encode the raw distance sum (log size compresses range)
ggplot(gauss_mix, aes(x=x, y=y)) + geom_point(aes(colour=dist_sum, size=log(dist_sum))) + 
  scale_colour_gradient() + ggtitle('Sum of distances') + 
  theme_bw()

# scatter plot z score: same map with standardised sums on a diverging palette
ggplot(gauss_mix, aes(x=x, y=y)) + geom_point(aes(colour=dist_sum_z, size=dist_sum_z)) + 
   scale_colour_distiller(palette='RdYlBu',direction=1) + 
  ggtitle('Sum of distances (z score)') + 
  theme_bw()

# histogram of the distance sums
ggplot(gauss_mix, aes(dist_sum)) + geom_histogram(bins = 20)

3 Uniqueness

3.1 Univariate uniqueness

Uniqueness can be thought of as \(1 - p\), where p is the probability of encountering a particular observation from random extractions from a set.

For example, these are the percentages of land cover categories in the UK: Farmland 56.7%, Natural 34.9%, Green urban 2.5%, Built on 5.9%. Taking a probabilistic view, we can define the uniqueness of a category as 1-p. The rarest category we would encounter by selecting a random area in the UK is green urban (\(U = .98\)).

Data source: https://www.eea.europa.eu/publications/COR0-landcover

# UK land-cover shares; uniqueness of a category is defined as 1 - p,
# where p is the probability of drawing that category at random.
uk_landcover <- tibble(
  cat = c('farm', 'natural', 'green urban', 'built'),
  pc = c(56.7, 34.9, 2.5, 5.9)
)
uk_landcover$p <- uk_landcover$pc / 100   # percentage -> probability
uk_landcover$uniq <- 1 - uk_landcover$p   # uniqueness score

uk_landcover
## # A tibble: 4 x 4
##   cat            pc     p  uniq
##   <chr>       <dbl> <dbl> <dbl>
## 1 farm         56.7 0.567 0.433
## 2 natural      34.9 0.349 0.651
## 3 green urban   2.5 0.025 0.975
## 4 built         5.9 0.059 0.941

To make it more interpretable, we can use the z score. This way, more unique observations emerge from the data.

# Standardise uniqueness so rare categories stand out.
# zscore() is a project helper — presumably scale()-based, which would return
# a one-column matrix and explain the `uniq_z[,1]` header below; TODO confirm.
uk_landcover$uniq_z = round(zscore(uk_landcover$uniq),2)

uk_landcover
## # A tibble: 4 x 5
##   cat            pc     p  uniq uniq_z[,1]
##   <chr>       <dbl> <dbl> <dbl>      <dbl>
## 1 farm         56.7 0.567 0.433      -1.24
## 2 natural      34.9 0.349 0.651      -0.39
## 3 green urban   2.5 0.025 0.975       0.88
## 4 built         5.9 0.059 0.941       0.74

We can think of uniqueness as a deviation from the uniform distribution. If all types of observation occur at the same probability, it is not possible to calculate a meaningful uniqueness score (\(z\) is NA).

# Uniform category distribution: every uniq value is identical, so the z
# score divides by a zero standard deviation and yields NaN for all rows.
uniform_tib = tibble(cat=c('cat1','cat2','cat3','cat4'), pc=1/4, uniq=1-1/4)
uniform_tib$uniq_z = zscore(uniform_tib$uniq)
print(uniform_tib)
## # A tibble: 4 x 4
##   cat      pc  uniq uniq_z[,1]
##   <chr> <dbl> <dbl>      <dbl>
## 1 cat1   0.25  0.75        NaN
## 2 cat2   0.25  0.75        NaN
## 3 cat3   0.25  0.75        NaN
## 4 cat4   0.25  0.75        NaN

We can use measures of entropy to see if the data has heterogeneity that can be used to quantify uniqueness. High entropy means low heterogeneity (no observation will be unique) and vice-versa.

# Shannon entropy of the category shares; entropy() is presumably from the
# `entropy` package or a project helper — TODO confirm. Uniform shares
# maximise entropy (log(4) ~ 1.39), so high entropy = homogeneous data.
print(paste('Entropy of uniform data', round(entropy(uniform_tib$pc),2)))
## [1] "Entropy of uniform data 1.39"
print(paste('Entropy of heterogeneous data', round(entropy(uk_landcover$pc),2)))
## [1] "Entropy of heterogeneous data 0.95"

3.2 Multi-variate uniqueness

The multivariate case is more interesting and challenging. It is useful for exploratory data analysis (EDA).

In this example, we consider four observations along two dimensions. The geometric interpretation of uniqueness of observation \(a\) in this case is \(u_a = \sum_{b \neq a} dist(a,b)\). To make the results more interpretable, we can obtain the \(z\) scores. It is possible to observe visually that point 1 is the most spatially isolated, while point 2 is the most central to the dataset and therefore has the lowest uniqueness.

The core function is dist_uniq(...) in the uniq_functions.R.

3.2.1 Minimal example

# Minimal 2-D example: four points (-1,-1), (0,0), (1,1), (0,1),
# built column-wise from the flat vector.
multiv_tib = as_tibble(matrix(c(-1,0,1,0,-1,0,1,1), ncol = 2))
print(multiv_tib)
## # A tibble: 4 x 2
## 1    -1    -1
## 2     0     0
## 3     1     1
## 4     0     1
# plot the points, labelled by row number
ggplot(multiv_tib, aes(x=V1, y=V2, label=rownames(multiv_tib))) + theme(aspect.ratio = 1) +
  geom_point(size=3, colour='blue', fill='blue') + geom_text(nudge_y = .1) + 
  ggtitle('Location of points')

# Full pairwise euclidean distance matrix: diagonal and upper triangle are
# kept so that rowSums later covers every point pair.
pairwise <- dist(multiv_tib, diag = TRUE, upper = TRUE)
dist_mat = as.matrix(pairwise)
print(dist_mat)
##          1        2        3        4
## 1 0.000000 1.414214 2.828427 2.236068
## 2 1.414214 0.000000 1.414214 1.000000
## 3 2.828427 1.414214 0.000000 1.000000
## 4 2.236068 1.000000 1.000000 0.000000
# uniqueness: total distance from each point to all others, plus its z score
multiv_tib$sum_dist = round(rowSums(dist_mat), 2)
multiv_tib$sum_dist_z = round(zscore(multiv_tib$sum_dist), 2)

# plot uniqueness: each point labelled with its raw sum of distances
ggplot(multiv_tib, aes(x=V1, y=V2, label=sum_dist)) + theme(aspect.ratio = 1) +
  geom_point(size=3, colour='green', fill='green') + geom_text(nudge_y = .1) + 
  ggtitle('Sum of distances to other points')

# plot z scores of uniqueness: same map, labelled with standardised sums
ggplot(multiv_tib, aes(x=V1, y=V2, label=sum_dist_z)) + theme(aspect.ratio = 1) +
  geom_point(size=3, colour='green', fill='green') + geom_text(nudge_y = .1) + 
  ggtitle('Sum of distances to other points (z scores)')

3.2.2 Experiment with high dim

On high dimensional data, the interpretation of \(u\) is less immediate. Therefore, it is useful to observe the behaviour of the uniqueness measure \(u\) through a Monte Carlo method.

A core idea is that data with uniformly and normally distributed variables can still produce high values of \(u\), generating false positives in the search for unique observations. The Shapiro-Wilk test on \(u\) consistently returns low W on heterogeneous distributions with actual groups and high W on homogeneous data, regardless of scale.

3.2.2.1 Generate synth data

#t = gen_random_matrix('normal', 10, 5)
#u = calc_uniqueness(t, 'euclidean')
#View(u$uniqueness)
#ggplot(u$uniqueness, aes(x=uniq_dist_avg)) + xlab("uniq_dist_avg") + 
#  geom_histogram(color="white", fill="lightgreen", bins = 10)#
#ggplot(u$uniqueness, aes(x=uniq)) + #xlab("uniq_dist_avg") + 
#  geom_histogram(color="white", fill="gray", bins = 10)

if (TRUE) {  # was `if (T)`: T/F are reassignable variables, TRUE/FALSE are not
  # Synthetic-data experiment: sweep data distribution, distance metric,
  # data scale, number of rows/columns; 10 trials per combination.
  res_rows <- list()  # collect per-trial rows; bind once after the loop
  set.seed(NULL)      # re-randomise the RNG on every run
  for (rand_data in c('normal','uniform','2groups','3groups'))
  for (dist_method in c('euclidean','manhattan','minkowski','mahalanobis'))
  for (data_scale in c(1,100,1000))
  for (row_n in c(10,100,1000,10000))
  for (col_n in c(2,10,20))
  for (trial in seq(10))
  {
    # Guard against degenerate sizes (unreachable with the grids above).
    # `||` short-circuits on scalars; `next` is a keyword, not a function.
    if (row_n == 0 || col_n == 0) next
    #if (dist_method != 'mahalanobis') next
    rand_tib <- gen_random_matrix(rand_data, row_n, col_n, data_scale)
    utib <- dist_uniq(rand_tib, dist_method)
    res_row <- desc_stats_uniq(utib)
    res_row <- res_row %>% add_column(distrib = rand_data, 
                             dist_method = dist_method,
                             data_scale = data_scale,
                             row_n = row_n, 
                             col_n = col_n, 
                             sample_n = nrow(utib),
                             trial = trial,
                             .before = "n")
    res_rows[[length(res_rows) + 1]] <- res_row
  }
  # Single bind avoids the O(n^2) cost of growing a tibble inside the loop.
  res <- bind_rows(res_rows)
  
  saveRDS(res, 'analysis/high_dimensional_experiment_1.rds')
  openxlsx::write.xlsx(res, 'analysis/high_dimensional_experiment_1.xlsx')
}

3.2.2.2 Analyse results

Summarise experiment 1 results to observe trends in \(u\) in different settings.

# Load experiment-1 results (one row per trial/configuration).
res_tib = readRDS('analysis/high_dimensional_experiment_1.rds') 
  #%>% #mutate_if(is.character, factor)
print(nrow(res_tib))
## [1] 5760
summary_res = tibble()

# Summarise uniqueness stats over several grouping-column sets; each
# iteration appends one grouped summary (tagged by `cols`) to summary_res.
for (cols in list(c('row_n'), c('col_n'), 
    c('distrib','row_n'), c('distrib','col_n'), c('distrib'),
    c('dist_method'), c('data_scale'),
    c('distrib','dist_method'),
    c('distrib','dist_method','data_scale'))){
  print(cols)
  cols_str = paste(cols, collapse = ' ')
  res_stats_tib = res_tib %>% 
    #group_by(distrib, dist_method) %>% 
    group_by(across(all_of(cols))) %>%  # all_of: group by externally-supplied column names
    dplyr::summarise(cols = cols_str, 
      n = n(), 
      mn_dist_min = mean(min),
      mn_dist_max = mean(max),
      mn_vcommon = mean(pc_very_common),
      mn_common = mean(pc_common),
      mn_inbet = mean(pc_inbet),
      mn_rare = mean(pc_rare),
      mn_vrare = mean(pc_very_rare),
      mn_skew = mean(skewness)) %>% 
    mutate_if(is.numeric, round, 2)
  
  # NOTE(review): growing a tibble with bind_rows in a loop is O(n^2);
  # fine at this size, but collect into a list for larger sweeps.
  summary_res = bind_rows(summary_res, res_stats_tib)
}
## [1] "row_n"
## [1] "col_n"
## [1] "distrib" "row_n"
## `summarise()` has grouped output by 'distrib'. You can override using the `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
## Column `distrib`
## [1] "distrib" "col_n"
## `summarise()` has grouped output by 'distrib'. You can override using the `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
## Column `distrib`
## [1] "distrib"
## [1] "dist_method"
## [1] "data_scale"
## [1] "distrib"     "dist_method"
## `summarise()` has grouped output by 'distrib'. You can override using the `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
## Column `distrib`
## [1] "distrib"     "dist_method" "data_scale"
## `summarise()` has grouped output by 'distrib', 'dist_method'. You can override using the `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
## Columns `distrib`, `dist_method`
summary_res
## # A tibble: 110 x 15
##    row_n cols            n mn_dist_min mn_dist_max mn_vcommon mn_common mn_inbet
##    <dbl> <chr>       <dbl>       <dbl>       <dbl>      <dbl>     <dbl>    <dbl>
##  1    10 row_n        1440       -1.03        2.01       87.9      3.38     5.68
##  2   100 row_n        1440       -1.16        3.37       88.3      3.17     4.77
##  3  1000 row_n        1440       -1.28        4.58       88.6      3.2      4.41
##  4 10000 row_n        1440       -1.28        4.96       88.7      3.19     4.38
##  5    NA col_n        1920       -0.85        4.77       89.0      3.05     4.31
##  6    NA col_n        1920       -1.32        3.45       88.2      3.32     4.71
##  7    NA col_n        1920       -1.39        2.98       87.8      3.33     5.41
##  8    10 distrib ro…   360       -0.68        2.09       83.6      2.64    11.2 
##  9   100 distrib ro…   360       -0.46        3.38       84.2      2.89     8.61
## 10  1000 distrib ro…   360       -0.46        4.72       84.8      3.23     7.24
## # … with 100 more rows, and 7 more variables: mn_rare <dbl>, mn_vrare <dbl>,
## #   mn_skew <dbl>, col_n <dbl>, distrib <chr>, dist_method <chr>,
## #   data_scale <dbl>

Impact of variables on distribution of uniqueness values.

Different distributions generate different common/rare distributions. 1,440 cases observed for each combination. Uniform and normally-distributed data generate a similar pattern, with normal data having more rare/very rare observations (2%, 1.1%). Uneven distributions with groups, as expected by their design, generate more rare/very rare observations (4.1% and 7.3%), having a large central group of close observations and relatively many distant observations:

Using the Dunn’s test, the only two distributions that are not significantly different are the normal and the uniform, which generate similar uniqueness patterns (TODO: why?).

# impact of data distribution
#summary_res %>% filter(cols == 'distrib') %>% dplyr::select(distrib, n, mn_dist_min, mn_dist_max, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% flextable()
summary_results_tib = summary_res

# compare groups by data distribution (original comment said "number of
# rows" — copy-paste slip; this chunk filters cols == 'distrib')
tib = summary_results_tib %>% filter(cols == 'distrib') %>% 
  dplyr::select(distrib, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% unique()
tib %>% flextable()
# Rebuild a categorical sample (vcommon..vrare) from the mean percentages
# for each distribution, then compare the groups non-parametrically.
all_vals = tibble()
u_labels = c('vcommon','common','inbet','rare','vrare')
for(i in tib$distrib){
  pcs = tib %>% filter(distrib == i) %>% 
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% t() %>% as.vector()
  sz = tib %>% filter(distrib == i) %>% dplyr::select(n) %>% .[[1]]  # sample size for this group
  vals = generate_cat_data_from_pc(pcs, u_labels, sz)
  row_tib = tibble(group = i, val = vals)
  all_vals = bind_rows(all_vals, row_tib)
}

# Dunn's test to check if the groups are different (non-parametric)
res_dunn = all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 6 x 9
##   .y.   group1  group2     n1    n2 statistic        p    p.adj p.adj.signif
## * <chr> <chr>   <chr>   <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
## 1 val   2groups 3groups  1438  1436     8.14  3.88e-16 2.33e-15 ****        
## 2 val   2groups normal   1438  1437     3.03  2.44e- 3 7.33e- 3 **          
## 3 val   2groups uniform  1438  1438     2.50  1.25e- 2 2.50e- 2 *           
## 4 val   3groups normal   1436  1437    -5.11  3.19e- 7 1.28e- 6 ****        
## 5 val   3groups uniform  1436  1438    -5.65  1.65e- 8 8.23e- 8 ****        
## 6 val   normal  uniform  1437  1438    -0.533 5.94e- 1 5.94e- 1 ns

The size of the dataset (n observations x n variables) has minimal impact on the behaviour of \(u\). Varying the number of observations from 10 to 10,000, it is possible to observe that with very few observations (10) it is less likely to obtain very rare observations, as expected. The other three rows converge and do not have significant differences (using Dunn’s test):

Rows:

# compare groups by number of rows
# For each row_n group, rebuild a categorical sample (vcommon..vrare) from
# the mean percentages, then test whether the groups differ (Dunn's test).
tib = summary_results_tib %>% filter(cols == 'row_n') %>% 
  dplyr::select(row_n, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare)
all_vals = tibble()
u_labels = c('vcommon','common','inbet','rare','vrare')
for(i in tib$row_n){
  pcs = tib %>% filter(row_n == i) %>% 
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% t() %>% as.vector()
  sz = tib %>% filter(row_n == i) %>% dplyr::select(n) %>% .[[1]]  # sample size for this group
  vals = generate_cat_data_from_pc(pcs, u_labels, sz)
  row_tib = tibble(group = i, val = vals)
  all_vals = bind_rows(all_vals, row_tib)
}

# Dunn's test to check if the groups are different (non-parametric)
res_dunn = all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 6 x 9
##   .y.   group1 group2    n1    n2 statistic       p  p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl>   <dbl>  <dbl> <chr>       
## 1 val   10     100     1438  1438     2.34  0.0190  0.0761 ns          
## 2 val   10     1000    1438  1439     2.68  0.00736 0.0368 *           
## 3 val   10     10000   1438  1437     2.78  0.00537 0.0322 *           
## 4 val   100    1000    1438  1439     0.335 0.738   1      ns          
## 5 val   100    10000   1438  1437     0.439 0.660   1      ns          
## 6 val   1000   10000   1439  1437     0.105 0.916   1      ns

The number of columns has no impact on the results:

# compare groups by number of columns (original comment said "rows" —
# copy-paste slip; this chunk filters cols == 'col_n')
tib = summary_results_tib %>% filter(cols == 'col_n') %>% 
  dplyr::select(col_n, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare)
all_vals = tibble()
u_labels = c('vcommon','common','inbet','rare','vrare')
for(i in tib$col_n){
  print(i)
  pcs = tib %>% filter(col_n == i) %>% 
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% t() %>% as.vector()
  sz = tib %>% filter(col_n == i) %>% dplyr::select(n) %>% .[[1]]  # sample size for this group
  print(paste('generating synth data:',sz))
  vals = generate_cat_data_from_pc(pcs, u_labels, sz)
  row_tib = tibble(group = i, val = vals)
  all_vals = bind_rows(all_vals, row_tib)
}
## [1] 2
## [1] "generating synth data: 1920"
## [1] 10
## [1] "generating synth data: 1920"
## [1] 20
## [1] "generating synth data: 1920"
# Dunn's test to check if the groups are different (non-parametric)
res_dunn = all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 3 x 9
##   .y.   group1 group2    n1    n2 statistic      p  p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl>  <dbl>  <dbl> <chr>       
## 1 val   2      10      1917  1917    -1.49  0.137  0.274  ns          
## 2 val   2      20      1917  1916    -2.23  0.0260 0.0779 ns          
## 3 val   10     20      1917  1916    -0.740 0.460  0.460  ns

The four distance methods do not generate significantly different proportions of \(u\) results.

# compare distance metrics
# Same reconstruction-and-Dunn's-test pattern, grouped by distance method.
tib = summary_results_tib %>% filter(cols == 'dist_method') %>% 
  dplyr::select(dist_method, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% unique()
all_vals = tibble()
u_labels = c('vcommon','common','inbet','rare','vrare')
for(i in tib$dist_method){
  pcs = tib %>% filter(dist_method == i) %>% 
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% t() %>% as.vector()
  sz = tib %>% filter(dist_method == i) %>% dplyr::select(n) %>% .[[1]]  # sample size for this group
  vals = generate_cat_data_from_pc(pcs, u_labels, sz)
  row_tib = tibble(group = i, val = vals)
  all_vals = bind_rows(all_vals, row_tib)
}

# Dunn's test to check if the groups are different (non-parametric)
res_dunn = all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 6 x 9
##   .y.   group1      group2         n1    n2 statistic     p p.adj p.adj.signif
## * <chr> <chr>       <chr>       <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 val   euclidean   mahalanobis  1437  1437    1.01   0.313     1 ns          
## 2 val   euclidean   manhattan    1437  1437   -0.111  0.912     1 ns          
## 3 val   euclidean   minkowski    1437  1437   -0.128  0.898     1 ns          
## 4 val   mahalanobis manhattan    1437  1437   -1.12   0.263     1 ns          
## 5 val   mahalanobis minkowski    1437  1437   -1.14   0.256     1 ns          
## 6 val   manhattan   minkowski    1437  1437   -0.0174 0.986     1 ns

Data scale doesn’t make any difference to the results either.

# compare data scale
# Same reconstruction-and-Dunn's-test pattern, grouped by data scale.
tib = summary_results_tib %>% filter(cols == 'data_scale') %>% 
  dplyr::select(data_scale, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% unique()
all_vals = tibble()
u_labels = c('vcommon','common','inbet','rare','vrare')
for(i in tib$data_scale){
  pcs = tib %>% filter(data_scale == i) %>% 
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% t() %>% as.vector()
  sz = tib %>% filter(data_scale == i) %>% dplyr::select(n) %>% .[[1]]  # sample size for this group
  vals = generate_cat_data_from_pc(pcs, u_labels, sz)
  row_tib = tibble(group = i, val = vals)
  all_vals = bind_rows(all_vals, row_tib)
}

# Dunn's test to check if the groups are different (non-parametric)
res_dunn = all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 3 x 9
##   .y.   group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 val   1      100     1918  1917    0.248  0.805     1 ns          
## 2 val   1      1000    1918  1918    0.297  0.766     1 ns          
## 3 val   100    1000    1917  1918    0.0499 0.960     1 ns

4 Case studies

4.1 London (land use and POIs)

TODO

4.2 London boroughs

Comparing boroughs of Greater London w.r.t. socio-economic variables.

4.2.1 Prep data

# Read London borough profiles (GeoJSON, via sf::st_read).
# NOTE(review): relative path assumes the working directory is the project root.
lnd_boro = st_read('data/london/london_borough_profiles_2015/london_boroughs_profiles_2015.geojson')
## Reading layer `london_boroughs_profiles_2015' from data source `/Users/andreaballatore/Dropbox/DRBX_Docs/Work/Projects/github_projects/calculating-uniqueness/data/london/london_borough_profiles_2015/london_boroughs_profiles_2015.geojson' using driver `GeoJSON'
## Simple feature collection with 33 features and 89 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -0.5103751 ymin: 51.28676 xmax: 0.3340156 ymax: 51.69187
## geographic CRS: WGS 84
# select variables: list available columns first
print(names(lnd_boro))
##  [1] "fid"                                                                                          
##  [2] "gss"                                                                                          
##  [3] "name"                                                                                         
##  [4] "hectares"                                                                                     
##  [5] "nonld_area"                                                                                   
##  [6] "ons_inner"                                                                                    
##  [7] "area_name"                                                                                    
##  [8] "inner_outer_london"                                                                           
##  [9] "gla_population_estimate_2017"                                                                 
## [10] "gla_household_estimate_2017"                                                                  
## [11] "inland_area_hectares"                                                                         
## [12] "population_density_per_hectare_2017"                                                          
## [13] "average_age_2017"                                                                             
## [14] "proportion_of_population_aged_0_15_2015"                                                      
## [15] "proportion_of_population_of_working_age_2015"                                                 
## [16] "proportion_of_population_aged_65_and_over_2015"                                               
## [17] "net_internal_migration_2015"                                                                  
## [18] "net_international_migration_2015"                                                             
## [19] "net_natural_change_2015"                                                                      
## [20] "pc_of_resident_population_born_abroad_2015"                                                   
## [21] "largest_migrant_population_by_country_of_birth_2011"                                          
## [22] "pc_of_largest_migrant_population_2011"                                                        
## [23] "second_largest_migrant_population_by_country_of_birth_2011"                                   
## [24] "pc_of_second_largest_migrant_population_2011"                                                 
## [25] "third_largest_migrant_population_by_country_of_birth_2011"                                    
## [26] "pc_of_third_largest_migrant_population_2011"                                                  
## [27] "pc_of_population_from_bame_groups_2016"                                                       
## [28] "pc_people_aged_3_whose_main_language_is_not_english_2011_census"                              
## [29] "overseas_nationals_entering_the_uk_nino_2015_16"                                              
## [30] "new_migrant_nino_rates_2015_16"                                                               
## [31] "largest_migrant_population_arrived_during_2015_16"                                            
## [32] "second_largest_migrant_population_arrived_during_2015_16"                                     
## [33] "third_largest_migrant_population_arrived_during_2015_16"                                      
## [34] "employment_rate_pc_2015"                                                                      
## [35] "male_employment_rate_2015"                                                                    
## [36] "female_employment_rate_2015"                                                                  
## [37] "unemployment_rate_2015"                                                                       
## [38] "youth_unemployment_claimant_rate_18_24_dec_15"                                                
## [39] "proportion_of_16_18_year_olds_who_are_neet_pc_2014"                                           
## [40] "proportion_of_the_working_age_population_who_claim_out_of_work_benefits_pc_may_2016"          
## [41] "pc_working_age_with_a_disability_2015"                                                        
## [42] "proportion_of_working_age_people_with_no_qualifications_pc_2015"                              
## [43] "proportion_of_working_age_with_degree_or_equivalent_and_above_pc_2015"                        
## [44] "gross_annual_pay_2016"                                                                        
## [45] "gross_annual_pay_male_2016"                                                                   
## [46] "gross_annual_pay_female_2016"                                                                 
## [47] "modelled_household_median_income_estimates_2012_13"                                           
## [48] "pc_adults_that_volunteered_in_past_12_months_2010_11_to_2012_13"                              
## [49] "number_of_jobs_by_workplace_2014"                                                             
## [50] "pc_of_employment_that_is_in_public_sector_2014"                                               
## [51] "jobs_density_2015"                                                                            
## [52] "number_of_active_businesses_2015"                                                             
## [53] "two_year_business_survival_rates_started_in_2013"                                             
## [54] "crime_rates_per_thousand_population_2014_15"                                                  
## [55] "fires_per_thousand_population_2014"                                                           
## [56] "ambulance_incidents_per_hundred_population_2014"                                              
## [57] "median_house_price_2015"                                                                      
## [58] "average_band_d_council_tax_charge__2015_16"                                                   
## [59] "new_homes_net_2015_16_provisional"                                                            
## [60] "homes_owned_outright_2014_pc"                                                                 
## [61] "being_bought_with_mortgage_or_loan_2014_pc"                                                   
## [62] "rented_from_local_authority_or_housing_association_2014_pc"                                   
## [63] "rented_from_private_landlord_2014_pc"                                                         
## [64] "pc_of_area_that_is_greenspace_2005"                                                           
## [65] "total_carbon_emissions_2014"                                                                  
## [66] "household_waste_recycling_rate_2014_15"                                                       
## [67] "number_of_cars_2011_census"                                                                   
## [68] "number_of_cars_per_household_2011_census"                                                     
## [69] "pc_of_adults_who_cycle_at_least_once_per_month_2014_15"                                       
## [70] "average_public_transport_accessibility_score_2014"                                            
## [71] "achievement_of_5_or_more_a_c_grades_at_gcse_or_equivalent_including_english_and_maths_2013_14"
## [72] "rates_of_children_looked_after_2016"                                                          
## [73] "pc_of_pupils_whose_first_language_is_not_english_2015"                                        
## [74] "pc_children_living_in_out_of_work_households_2015"                                            
## [75] "male_life_expectancy_2012_14"                                                                 
## [76] "female_life_expectancy_2012_14"                                                               
## [77] "teenage_conception_rate_2014"                                                                 
## [78] "life_satisfaction_score_2011_14_out_of_10"                                                    
## [79] "worthwhileness_score_2011_14_out_of_10"                                                       
## [80] "happiness_score_2011_14_out_of_10"                                                            
## [81] "anxiety_score_2011_14_out_of_10"                                                              
## [82] "childhood_obesity_prevalance_pc_2015_16"                                                      
## [83] "people_aged_17_with_diabetes_pc"                                                              
## [84] "mortality_rate_from_causes_considered_preventable_2012_14"                                    
## [85] "political_control_in_council"                                                                 
## [86] "proportion_of_seats_won_by_conservatives_in_2014_election"                                    
## [87] "proportion_of_seats_won_by_labour_in_2014_election"                                           
## [88] "proportion_of_seats_won_by_lib_dems_in_2014_election"                                         
## [89] "turnout_at_2014_local_elections"                                                              
## [90] "geometry"
# Keep a compact set of borough indicators, renaming them inline via select()
# (select() can rename while selecting, so no separate rename() step is needed).
# Column order and names match the previous select()+rename() pipeline exactly.
lnd_boro <- lnd_boro %>%
  select(id = gss,
         name,
         pop_dens = population_density_per_hectare_2017,
         average_age = average_age_2017,
         pop_born_abroad = pc_of_resident_population_born_abroad_2015,
         employment = employment_rate_pc_2015,
         household_income = modelled_household_median_income_estimates_2012_13,
         conservative_seats = proportion_of_seats_won_by_conservatives_in_2014_election,
         labour_seats = proportion_of_seats_won_by_labour_in_2014_election,
         house_price = median_house_price_2015)

# quick-look maps of every retained variable (sf keeps the geometry column
# through select(), so each variable is drawn as a choropleth)
plot(lnd_boro %>% select(-id, -name), border = NA)

# integrate missing data
# from https://data.gov.uk/dataset/248f5f04-23cf-4470-9216-0d0be9b877a8/london-borough-profiles-and-atlas/datafile/1e5d5226-a1a7-4097-afbf-d3bd39226c39/preview
#lnd_boro[lnd_boro$name == "Kensington and Chelsea", "gross_annual_pay_2016"] = 63620

Find problematic variables — highly correlated pairs and skewed distributions:

# pairwise Kendall (rank) correlations between the numeric borough variables;
# stretch() turns the correlation matrix into a long x/y/r tibble
lnd_boro_corr = lnd_boro %>% st_drop_geometry() %>% select(-name,-id) %>% correlate(method='kendall') %>% stretch()
## 
## Correlation method: 'kendall'
## Missing treated using: 'pairwise.complete.obs'
# observe high correlations (|tau| > .5) to spot redundant variables
lnd_boro_corr %>% filter(r > .5 | r < -.5)
## # A tibble: 10 x 3
##    x                  y                       r
##    <chr>              <chr>               <dbl>
##  1 pop_dens           average_age        -0.514
##  2 average_age        pop_dens           -0.514
##  3 average_age        conservative_seats  0.586
##  4 average_age        labour_seats       -0.627
##  5 household_income   conservative_seats  0.501
##  6 conservative_seats average_age         0.586
##  7 conservative_seats household_income    0.501
##  8 conservative_seats labour_seats       -0.697
##  9 labour_seats       average_age        -0.627
## 10 labour_seats       conservative_seats -0.697
# remove the Labour-seats variable: it correlates strongly (|tau| > .6) with
# both average_age and conservative_seats, so it adds little information
lnd_boro = lnd_boro %>% select(-labour_seats)
# descriptive statistics per variable; norm=TRUE adds skewness, kurtosis and
# Shapiro-Wilk normality-test columns (spelled TRUE, not the reassignable T)
lnd_boro_desc_stats = t(stat.desc(lnd_boro %>% st_drop_geometry() %>% select(-id,-name), norm=TRUE))
lnd_boro_desc_stats
##                    nbr.val nbr.null nbr.na          min          max
## pop_dens                33        0      0     21.84036     155.6205
## average_age             33        0      0     31.40000      43.2000
## pop_born_abroad         32        0      1     10.90000      54.1000
## employment              33        0      0     64.60000      79.6000
## household_income        33        0      0  28780.00000   63620.0000
## conservative_seats      32        5      1      0.00000      85.0000
## house_price             33        0      0 243500.00000 1200000.0000
##                          range          sum       median         mean
## pop_dens              133.7801     2457.754     59.17539     74.47740
## average_age            11.8000     1200.400     36.20000     36.37576
## pop_born_abroad        43.2000     1168.400     36.90000     36.51250
## employment             15.0000     2399.600     73.10000     72.71515
## household_income    34840.0000  1308190.000  37040.00000  39642.12121
## conservative_seats     85.0000     1046.898     30.00000     32.71556
## house_price        956500.0000 15360443.000 410000.00000 465467.96970
##                         SE.mean CI.mean.0.95          var      std.dev
## pop_dens           6.856818e+00    13.966880 1.551526e+03 3.938942e+01
## average_age        4.330790e-01     0.882153 6.189394e+00 2.487849e+00
## pop_born_abroad    1.855380e+00     3.784072 1.101579e+02 1.049561e+01
## employment         7.345005e-01     1.496128 1.780320e+01 4.219384e+00
## household_income   1.287259e+03  2622.061541 5.468221e+07 7.394742e+03
## conservative_seats 4.776588e+00     9.741915 7.301052e+02 2.702046e+01
## house_price        3.557386e+04 72461.579214 4.176148e+10 2.043563e+05
##                      coef.var   skewness   skew.2SE   kurtosis    kurt.2SE
## pop_dens           0.52887744  0.5506993  0.6738271 -0.9455118 -0.59211886
## average_age        0.06839306  0.4105474  0.5023395  0.2371737  0.14852807
## pop_born_abroad    0.28745261 -0.4072860 -0.4913485 -0.1454105 -0.08982929
## employment         0.05802620 -0.2030418 -0.2484388 -1.0326806 -0.64670761
## household_income   0.18653750  1.3316678  1.6294081  1.8164895  1.13756141
## conservative_seats 0.82592087  0.3562270  0.4297511 -1.2819967 -0.79197072
## house_price        0.43903399  1.8238259  2.2316052  3.3048996  2.06966585
##                    normtest.W   normtest.p
## pop_dens            0.9229907 2.219048e-02
## average_age         0.9747445 6.211098e-01
## pop_born_abroad     0.9603938 2.815670e-01
## employment          0.9631147 3.158213e-01
## household_income    0.8804765 1.723490e-03
## conservative_seats  0.9128969 1.340429e-02
## house_price         0.7935325 2.435519e-05
# histograms to see normality: one facet per variable, free x scales
# NOTE(review): gather() is superseded; pivot_longer() is the modern equivalent
ggplot(gather(lnd_boro %>% st_drop_geometry() %>% select(-name,-id)), aes(value)) + 
    geom_histogram(bins = 20) + 
    facet_wrap(~key, scales = 'free_x')
## Warning: Removed 2 rows containing non-finite values (stat_bin).

The variables whose skewness is a concern are median_house_price_2015 and the modelled household median income estimates. Both will be normalised with log10.

# log10-transform the two right-skewed variables; the +1 offset guards
# against log10(0) (scaling itself happens later, before distances)
lnd_boro <- lnd_boro %>%
  mutate(house_price = log10(house_price + 1),
         household_income = log10(household_income + 1))

# skewness is now acceptable
# histograms to re-check normality after the log10 transforms
ggplot(gather(lnd_boro %>% st_drop_geometry() %>% select(-name,-id)), aes(value)) + 
    geom_histogram(bins = 20) + 
    facet_wrap(~key, scales = 'free_x')
## Warning: Removed 2 rows containing non-finite values (stat_bin).

Multidimensional scaling of boroughs.

# MDS: z-scale each numeric variable, compute pairwise distances between
# boroughs, and project into two dimensions with classical MDS (cmdscale)
mds <- lnd_boro %>% select(-name,-id) %>% st_drop_geometry() %>% 
  mutate_if(is.numeric, scale) %>% dist() %>% cmdscale() %>% as_tibble()
colnames(mds) <- c("Dim1", "Dim2")
# row order of cmdscale() matches the input rows, so names align positionally
mds$name = lnd_boro$name

# plot MDS with borough names as labels
ggplot(mds, aes(x=Dim1, y=Dim2)) +
  ggtitle("Multi-dimensional scaling of Greater London boroughs") + 
  geom_point(size=.2) +
  geom_text(
    label=mds$name, 
    size=2,
    #nudge_x = .25, 
    nudge_y = .1, 
    check_overlap = FALSE  # keep all labels (FALSE, not the reassignable F)
  ) + theme_bw()

rm(mds)

4.2.2 Calc uniqueness

Calculate uniqueness of boroughs based on all variables.

# supported distance metrics: 'euclidean', 'manhattan', 'minkowski', 'mahalanobis'

# optionally filter out two boroughs by GSS id (disabled: all 33 kept)
#lnd_boro = lnd_boro %>% filter(!(id %in% c('E09000001','E09000020')))

method = 'euclidean'
# uniqueness: dist_uniq() scales the data and sums pairwise distances per row
lnd_boro_uniq = dist_uniq(lnd_boro %>% st_drop_geometry(), method)
## [1] "dist_uniq 33 9 euclidean"
## [1] "scaling data..."
#lnd_boro_u_man = dist_uniq(lnd_boro %>% st_drop_geometry(), 'manhattan')
#lnd_boro_u_mink = dist_uniq(lnd_boro %>% st_drop_geometry(), 'minkowski')
#lnd_boro_u_maha = dist_uniq(lnd_boro %>% st_drop_geometry(), 'mahalanobis', impute_na = T)

# export rounded results to Excel; the metric name is embedded in the filename
openxlsx::write.xlsx(lnd_boro_uniq %>% mutate_if(is.numeric, round, 4),
                     paste0('analysis/london_boroughs_uniqueness-',method,'.xlsx'))

4.2.3 Analyse and viz

# bar chart of uniqueness classes (disabled)
#lnd_boro_uniq %>% select(sum_dists_class) %>% plot()
# re-attach borough geometries to the uniqueness table (implicit merge on id)
lnd_boro_uniq_geo = merge(lnd_boro %>% select(id), lnd_boro_uniq)

# choropleth maps of the uniqueness scores
plot_uniq_map(lnd_boro_uniq_geo)
## tmap mode set to plotting

## Variable(s) "sum_dists_z" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

# heatmaps of scaled variable values for the extreme boroughs
# NOTE(review): judging by the output, the second argument presumably selects
# most-unique (TRUE) vs least-unique (FALSE) rows — confirm against
# gen_variable_heatmap(). TRUE/FALSE spelled out (T/F are reassignable).
gen_variable_heatmap(lnd_boro_uniq_geo, TRUE, 50)
## # A tibble: 297 x 3
##    row_id                 col_name           value[,1]
##    <chr>                  <chr>                  <dbl>
##  1 City of London         pop_dens           -1.12    
##  2 City of London         average_age         2.74    
##  3 City of London         pop_born_abroad    NA       
##  4 City of London         employment         -1.92    
##  5 City of London         household_income    2.84    
##  6 City of London         conservative_seats NA       
##  7 City of London         house_price         1.69    
##  8 City of London         sum_dists_z         3.43    
##  9 City of London         sum_dists_z_p       0.000297
## 10 Kensington and Chelsea pop_dens            1.44    
## # … with 287 more rows
## Warning: Removed 2 rows containing missing values (geom_text).

gen_variable_heatmap(lnd_boro_uniq_geo, FALSE, 50)
## # A tibble: 297 x 3
##    row_id         col_name           value[,1]
##    <chr>          <chr>                  <dbl>
##  1 Waltham Forest pop_dens             -0.0839
##  2 Waltham Forest average_age          -0.513 
##  3 Waltham Forest pop_born_abroad       0.0655
##  4 Waltham Forest employment            0.0912
##  5 Waltham Forest household_income     -0.964 
##  6 Waltham Forest conservative_seats   -0.224 
##  7 Waltham Forest house_price          -0.463 
##  8 Waltham Forest sum_dists_z          -1.03  
##  9 Waltham Forest sum_dists_z_p         0.849 
## 10 Greenwich      pop_dens             -0.388 
## # … with 287 more rows
## Warning: Removed 2 rows containing missing values (geom_text).

# choropleth
# Based on <https://mharinga.github.io/choropleth.html> (accessed in May 2022).
#u_colors = rev(toupper(c('#f0f9e8','#bae4bc','#7bccc4','#43a2ca','#0868ac')))
#lnd_boro_u_geo %>% select(sum_dists_class) %>% plot(border = 'lightgray', 
#        pal = u_colors,
#        main='Uniqueness of London boroughs')

4.3 Countries (World Bank)

Calculate uniqueness of countries based on World Bank indicators (2019).

4.3.1 Prep data

Select countries with at least 1 million people:

# load 2019 World Bank indicators with country boundaries; keep countries that
# have a valid ISO code and at least 1 million inhabitants
wb_geo = read_sf("data/world_bank/indicators_by_year/world_bank_indicators_countries-2019.geojson") %>% filter(!is.na(iso3c) & sp.pop.totl >= 1e6 & iso_a3 != '-99')
names(wb_geo)
##  [1] "iso_a2"            "iso_a3"            "name"             
##  [4] "name_long"         "abbrev"            "type"             
##  [7] "continent"         "region_un"         "subregion"        
## [10] "region_wb"         "un_a3"             "wb_a2"            
## [13] "wb_a3"             "iso3c"             "country"          
## [16] "date"              "it.net.user.zs"    "ms.mil.xpnd.gd.zs"
## [19] "ne.trd.gnfs.zs"    "ny.gdp.mktp.cd"    "ny.gdp.mktp.pp.cd"
## [22] "sl.isv.ifrm.zs"    "sp.pop.grow"       "sp.pop.totl"      
## [25] "sp.rur.totl.zs"    "sp.urb.totl.in.zs" "region"           
## [28] "income_group"      "eu"                "geometry"
nrow(wb_geo)
## [1] 157
# ISO codes must uniquely identify countries (merges below rely on this)
stopifnot(is_unique(wb_geo$iso_a3))
stopifnot(is_unique(wb_geo$iso_a2))

# use Winkel Tripel for visualisations
crs_wintri <- "+proj=wintri +datum=WGS84 +no_defs +over"
wb_geo <- st_transform(wb_geo, crs = crs_wintri)

Select and rename variables.

# rename World Bank indicator codes to readable names, derive per-capita GDP,
# and keep only the variables used in the analysis
wb_tib = wb_geo %>% st_drop_geometry() %>% rename(
  population = sp.pop.totl,
  pop_growth = sp.pop.grow,
  urban_pop = sp.urb.totl.in.zs,
  gdp = ny.gdp.mktp.pp.cd,
  gdp_trade = ne.trd.gnfs.zs,
  internet_access = it.net.user.zs,
  military_exp = ms.mil.xpnd.gd.zs
) %>% mutate(pc_gdp = gdp / population) %>%
select(iso_a2, iso_a3, name, region_un, population, pop_growth, gdp, pc_gdp,
             urban_pop, internet_access, military_exp) # income_group, 

# (removed leftover dead no-op `wb_tib = wb_tib` self-assignment)

# peek at 10 random countries
wb_tib %>% sample_n(10)
## # A tibble: 10 x 11
##    iso_a2 iso_a3 name  region_un population pop_growth      gdp pc_gdp urban_pop
##    <chr>  <chr>  <chr> <chr>          <dbl>      <dbl>    <dbl>  <dbl>     <dbl>
##  1 KG     KGZ    Kyrg… Asia         6456900      2.10   3.54e10  5486.      36.6
##  2 IE     IRL    Irel… Europe       4941444      1.51   4.36e11 88241.      63.4
##  3 SE     SWE    Swed… Europe      10285453      1.08   5.74e11 55820.      87.7
##  4 ZM     ZMB    Zamb… Africa      17861030      2.89   6.47e10  3624.      44.1
##  5 SK     SVK    Slov… Europe       5454073      0.134  1.86e11 34067.      53.7
##  6 LK     LKA    Sri … Asia        21803000      0.612  2.98e11 13657.      18.6
##  7 VE     VEN    Vene… Americas    28515829     -1.24  NA          NA       88.2
##  8 CA     CAN    Cana… Americas    37589262      1.42   1.93e12 51342.      81.5
##  9 MX     MEX    Mexi… Americas   127575529      1.09   2.63e12 20582.      80.4
## 10 SS     SSD    S. S… Africa      11062113      0.782 NA          NA       19.9
## # … with 2 more variables: internet_access <dbl>, military_exp <dbl>

Transform problematic variables:

# pairwise Kendall correlations between the numeric country variables;
# "na.or.complete" uses complete cases for the correlation computation
wb_corr = wb_tib %>% select_if(is.numeric) %>% correlate(method='kendall', use="na.or.complete") %>% stretch()
## 
## Correlation method: 'kendall'
## Missing treated using: 'na.or.complete'
# observe high correlations (|tau| > .5)
wb_corr %>% filter(r > .5 | r < -.5)
## # A tibble: 8 x 3
##   x               y                   r
##   <chr>           <chr>           <dbl>
## 1 population      gdp             0.510
## 2 gdp             population      0.510
## 3 pc_gdp          urban_pop       0.553
## 4 pc_gdp          internet_access 0.794
## 5 urban_pop       pc_gdp          0.553
## 6 urban_pop       internet_access 0.554
## 7 internet_access pc_gdp          0.794
## 8 internet_access urban_pop       0.554
# histograms to see normality, one facet per numeric variable
ggplot(gather(wb_tib %>% select_if(is.numeric)), aes(value)) + 
    geom_histogram(bins = 20) + 
    facet_wrap(~key, scales = 'free_x')
## Warning: Removed 36 rows containing non-finite values (stat_bin).

The variables whose skewness is a serious concern are population and gdp; pc_gdp and military_exp are also right-skewed. All four will be normalised with log10.

# log10-transform the skewed variables (+1 guards against log10(0));
# note: no scaling happens here — dist_uniq() scales later
wb_trans_tib = wb_tib %>% mutate( gdp = log10(gdp+1), 
                                population = log10(population+1),
                                pc_gdp = log10(pc_gdp+1),
                                military_exp = log10(military_exp+1))

# re-check distributions after the transforms
ggplot(gather(wb_trans_tib %>% select_if(is.numeric)), aes(value)) + 
    geom_histogram(bins = 20) + 
    facet_wrap(~key, scales = 'free_x')
## Warning: Removed 36 rows containing non-finite values (stat_bin).

Multi-dimensional scaling of countries to show clustering:

# MDS of countries: keep only numeric indicators before computing distances.
# The previous select(-iso_a2,-name) left the character columns iso_a3 and
# region_un in the data, which dist() coerced to NA ("NAs introduced by
# coercion" warning); restricting to numeric columns removes that defect.
mds <- wb_trans_tib %>% select_if(is.numeric) %>% 
  mutate_if(is.numeric, scale) %>%
  dist() %>% cmdscale() %>% as_tibble()
colnames(mds) <- c("Dim1", "Dim2")
# cmdscale() preserves row order, so labels align positionally
mds$name = wb_trans_tib$iso_a3
mds$region_un = wb_trans_tib$region_un

# plot MDS, colouring the ISO-3 labels by UN region to show clustering
ggplot(mds, aes(x=Dim1, y=Dim2, color=region_un)) +
  ggtitle("Multi-dimensional scaling of countries") + 
  #geom_point(size=0) +
  geom_text(
    label = mds$name, 
    size = 2,
    #nudge_x = .25, nudge_y = .25
    #check_overlap = T
  ) + theme_bw()

rm(mds)

4.3.2 Calc uniqueness

#meth = 'mahalanobis'
meth = 'manhattan'
#meth = 'euclidean'
# drop the two absolute-size variables so uniqueness is not dominated by
# country size (per-capita and rate variables remain)
filt_wb_trans_tib = wb_trans_tib %>% select(-population,-gdp)

# impute_na = TRUE (spelled out; T is reassignable) fills missing indicator
# values before scaling and distance computation
wb_uniq = dist_uniq(filt_wb_trans_tib, meth, impute_na = TRUE)
## [1] "dist_uniq 157 9 manhattan"
## [1] "imputing missing values..."
## [1] "scaling data..."
summary(wb_uniq)
##     iso_a2             iso_a3              name            region_un        
##  Length:157         Length:157         Length:157         Length:157        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     pop_growth.V1          pc_gdp.V1           urban_pop.V1    
##  Min.   :-2.7301313   Min.   :-2.3996204   Min.   :-2.1146868  
##  1st Qu.:-0.7484615   1st Qu.:-0.7282080   1st Qu.:-0.7710306  
##  Median : 0.0270864   Median : 0.0835194   Median : 0.0424196  
##  Mean   : 0.0000000   Mean   : 0.0000000   Mean   : 0.0000000  
##  3rd Qu.: 0.7608670   3rd Qu.: 0.8291936   3rd Qu.: 0.8433711  
##  Max.   : 2.7727087   Max.   : 1.8291785   Max.   : 1.7855400  
##   internet_access.V1    military_exp.V1     sum_dists      sum_dists_dec   
##  Min.   :-1.8469114   Min.   :-2.418894   Min.   : 669.1   Min.   : 1.000  
##  1st Qu.:-0.9510277   1st Qu.:-0.531860   1st Qu.: 779.1   1st Qu.: 3.000  
##  Median : 0.2865862   Median :-0.084567   Median : 854.3   Median : 5.000  
##  Mean   : 0.0000000   Mean   : 0.000000   Mean   : 885.0   Mean   : 5.433  
##  3rd Qu.: 0.8885776   3rd Qu.: 0.412638   3rd Qu.: 961.8   3rd Qu.: 8.000  
##  Max.   : 1.4961384   Max.   : 3.388462   Max.   :1361.3   Max.   :10.000  
##   sum_dists_z      sum_dists_z_p          sum_dists_class sum_dists_p_thresh
##  Min.   :-1.4363   Min.   :0.0007658   very rare  :  2    ***:  2           
##  1st Qu.:-0.7043   1st Qu.:0.3046804   rare       :  2    ** :  2           
##  Median :-0.2043   Median :0.5809292   medium     :  7    *  :  7           
##  Mean   : 0.0000   Mean   :0.5255461   common     :  7    .  :  7           
##  3rd Qu.: 0.5110   3rd Qu.:0.7593628   very common:139       :139           
##  Max.   : 3.1686   Max.   :0.9245428                                        
##  dist_method       
##  Length:157        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# export rounded results to Excel. Bug fix: the filename must embed `meth`
# (the metric chosen for this countries section, 'manhattan'), not the stale
# `method` variable left over from the boroughs analysis ('euclidean').
openxlsx::write.xlsx(wb_uniq %>% mutate_if(is.numeric, round, 4),
                     paste0('analysis/wb_countries_uniqueness-',meth,'.xlsx'))

# 20 most unique countries
wb_uniq %>% head(20)
## # A tibble: 20 x 16
##    iso_a2 iso_a3 name          region_un pop_growth[,1] pc_gdp[,1] urban_pop[,1]
##    <chr>  <chr>  <chr>         <chr>              <dbl>      <dbl>         <dbl>
##  1 BI     BDI    Burundi       Africa             1.60     -2.40          -2.11 
##  2 NE     NER    Niger         Africa             2.18     -1.98          -1.97 
##  3 BH     BHR    Bahrain       Asia               2.77      1.16           1.31 
##  4 OM     OMN    Oman          Asia               1.46      0.724          1.13 
##  5 KW     KWT    Kuwait        Asia               0.320     1.25           1.79 
##  6 SA     SAU    Saudi Arabia  Asia               0.320     1.20           1.07 
##  7 MW     MWI    Malawi        Africa             1.17     -2.10          -1.94 
##  8 CD     COD    Dem. Rep. Co… Africa             1.65     -2.07          -0.685
##  9 TD     TCD    Chad          Africa             1.47     -1.75          -1.66 
## 10 PG     PNG    Papua New Gu… Oceania            0.567    -0.873         -2.11 
## 11 UG     UGA    Uganda        Africa             1.97     -1.47          -1.62 
## 12 MD     MDA    Moldova       Europe            -2.73      0.0816        -0.789
## 13 MZ     MOZ    Mozambique    Africa             1.40     -1.94          -1.07 
## 14 ET     ETH    Ethiopia      Africa             1.12     -1.46          -1.76 
## 15 MG     MDG    Madagascar    Africa             1.18     -1.72          -1.01 
## 16 SG     SGP    Singapore     Asia              -0.143     1.83           1.79 
## 17 IL     ISR    Israel        Asia               0.521     1.06           1.45 
## 18 LR     LBR    Liberia       Africa             0.982    -1.84          -0.390
## 19 TG     TGO    Togo          Africa             0.976    -1.75          -0.811
## 20 BF     BFA    Burkina Faso  Africa             1.35     -1.47          -1.36 
## # … with 9 more variables: internet_access <dbl[,1]>, military_exp <dbl[,1]>,
## #   sum_dists <dbl>, sum_dists_dec <int>, sum_dists_z <dbl>,
## #   sum_dists_z_p <dbl>, sum_dists_class <fct>, sum_dists_p_thresh <fct>,
## #   dist_method <chr>

4.3.3 Analyse and viz

Based on https://r-spatial.github.io/sf/articles/sf5.html

# Kendall correlations of every numeric variable against the uniqueness score
# (sum_dists), sorted by decreasing correlation
corr_tib = wb_uniq %>% select_if(is.numeric) %>% correlate(method='kendall') %>% stretch() %>% filter(x=='sum_dists') %>% arrange(-r)
## 
## Correlation method: 'kendall'
## Missing treated using: 'pairwise.complete.obs'
# plot distributions of the raw, z-scored and p-value uniqueness measures
ggplot(gather(wb_uniq %>% select_if(is.numeric) %>% 
                select(sum_dists, sum_dists_z, sum_dists_z_p)), 
    aes(value)) + 
    geom_histogram(bins = 20) + 
    facet_wrap(~key, scales = 'free_x')

# re-attach country geometries to the uniqueness table for mapping
wb_uniq_geo = merge(wb_geo %>% select(iso_a3), wb_uniq, by='iso_a3')

Generate world maps with uniqueness information:

# NOTE(review): this st_transform looks redundant — wb_geo was already
# projected to Winkel Tripel above, and the merge inherits that CRS; the call
# is a harmless no-op re-projection, kept for safety.
wb_uniq_geo <- st_transform(wb_uniq_geo, crs = crs_wintri)
plot_uniq_map(wb_uniq_geo)
## tmap mode set to plotting

## Variable(s) "sum_dists_z" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

# heatmap of the most unique countries (default first argument behaviour)
gen_variable_heatmap(wb_uniq_geo)
## # A tibble: 210 x 3
##    row_id  col_name        value[,1]
##    <chr>   <chr>               <dbl>
##  1 Burundi pop_growth       1.60    
##  2 Burundi pc_gdp          -2.40    
##  3 Burundi urban_pop       -2.11    
##  4 Burundi internet_access -1.76    
##  5 Burundi military_exp     0.174   
##  6 Burundi sum_dists_z      3.17    
##  7 Burundi sum_dists_z_p    0.000766
##  8 Niger   pop_growth       2.18    
##  9 Niger   pc_gdp          -1.98    
## 10 Niger   urban_pop       -1.97    
## # … with 200 more rows

# least unique countries; FALSE spelled out (F is reassignable)
gen_variable_heatmap(wb_uniq_geo, FALSE)
## # A tibble: 210 x 3
##    row_id   col_name        value[,1]
##    <chr>    <chr>               <dbl>
##  1 Bolivia  pop_growth         0.0801
##  2 Bolivia  pc_gdp            -0.269 
##  3 Bolivia  urban_pop          0.427 
##  4 Bolivia  internet_access   -0.377 
##  5 Bolivia  military_exp      -0.164 
##  6 Bolivia  sum_dists_z       -1.44  
##  7 Bolivia  sum_dists_z_p      0.925 
##  8 Paraguay pop_growth        -0.0373
##  9 Paraguay pc_gdp             0.0570
## 10 Paraguay urban_pop          0.0716
## # … with 200 more rows

4.4 EU regions (Eurostat)

European regions at NUTS 2 level (https://ec.europa.eu/eurostat/web/nuts/background).

4.4.1 Prep data

# load EU NUTS-2 regions and drop the FRY* codes (French overseas regions,
# incl. French Guiana) so the European map extent stays compact
eu_geo = read_sf("data/eurostat/eu_vars_with_bounds/eu_nuts_2_eurostat_2018.geojson") %>%
  filter(!(NUTS_ID %in% c('FRY3','FRY2','FRY1','FRY4','FRY5')))

summary(eu_geo)
##    NUTS_ID               id             CNTR_CODE          NUTS_NAME        
##  Length:327         Length:327         Length:327         Length:327        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    LEVL_CODE     FID                geo            broadband_access_household
##  Min.   :2   Length:327         Length:327         Min.   : 84.0             
##  1st Qu.:2   Class :character   Class :character   1st Qu.: 97.0             
##  Median :2   Mode  :character   Mode  :character   Median : 99.0             
##  Mean   :2                                         Mean   : 97.5             
##  3rd Qu.:2                                         3rd Qu.: 99.0             
##  Max.   :2                                         Max.   :100.0             
##                                                    NA's   :152               
##  disp_income_household gender_employ_gap life_expectancy_f life_expectancy_m
##  Min.   :12300         Min.   : 0.70     Min.   :77.30     Min.   :70.1     
##  1st Qu.:16125         1st Qu.: 7.90     1st Qu.:82.00     1st Qu.:76.4     
##  Median :17450         Median :10.20     Median :83.50     Median :79.0     
##  Mean   :18742         Mean   :13.49     Mean   :83.33     Mean   :78.2     
##  3rd Qu.:20075         3rd Qu.:15.75     3rd Qu.:84.90     3rd Qu.:80.4     
##  Max.   :47000         Max.   :48.80     Max.   :88.10     Max.   :83.0     
##  NA's   :277           NA's   :4         NA's   :4         NA's   :4        
##  life_expectancy_t   region_gdp     region_population  unemployment_rate_f
##  Min.   :73.60     Min.   :  1365   Min.   :   29489   Min.   : 1.600     
##  1st Qu.:79.30     1st Qu.: 16739   1st Qu.:  940271   1st Qu.: 3.500     
##  Median :81.50     Median : 34901   Median : 1506232   Median : 5.450     
##  Mean   :80.79     Mean   : 53363   Mean   : 1889104   Mean   : 7.891     
##  3rd Qu.:82.60     3rd Qu.: 65104   3rd Qu.: 2310766   3rd Qu.: 9.350     
##  Max.   :85.50     Max.   :733875   Max.   :15029231   Max.   :36.300     
##  NA's   :4         NA's   :17                          NA's   :17         
##  unemployment_rate_m unemployment_rate_t          geometry  
##  Min.   : 0.800      Min.   : 1.300      MULTIPOLYGON :327  
##  1st Qu.: 3.800      1st Qu.: 3.600      epsg:4326    :  0  
##  Median : 5.300      Median : 5.400      +proj=long...:  0  
##  Mean   : 6.745      Mean   : 7.064                         
##  3rd Qu.: 8.200      3rd Qu.: 8.675                         
##  Max.   :22.900      Max.   :29.000                         
##  NA's   :14          NA's   :5
# project data to ETRS 89 LAEA (EPSG:3035), suitable for Europe-wide mapping
eu_geo <- eu_geo %>% st_transform(3035)

# get more variables (disabled)
#eu_tib = read_tsv('data/eurostat/eu_all_geo_data.tsv')
#summary(eu_tib)

# keep id/name columns plus population, life expectancy, GDP and unemployment;
# derive per-capita GDP — the *1e6 factor suggests region_gdp is in millions
# of currency units (TODO confirm units against the Eurostat source)
eu_geo = eu_geo %>% select(NUTS_ID, CNTR_CODE, NUTS_NAME, region_population, life_expectancy_t, region_gdp, unemployment_rate_t) %>% mutate(pc_gdp = region_gdp/region_population*1e6)

eu_geo %>% sample_n(10)
## Simple feature collection with 10 features and 8 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 3377114 ymin: 2143784 xmax: 5375179 ymax: 3390452
## projected CRS:  ETRS89-extended / LAEA Europe
## # A tibble: 10 x 9
##    NUTS_ID CNTR_CODE NUTS_NAME      region_populati… life_expectancy… region_gdp
##  * <chr>   <chr>     <chr>                     <dbl>            <dbl>      <dbl>
##  1 DE23    DE        Oberpfalz               1104407             80.9     47273.
##  2 ITC4    IT        Lombardia              10036258             84      388065.
##  3 PL43    PL        Lubuskie                1004127             76.9     10798.
##  4 RS22    RS        Регион Јужне …          1513383             75.3      6058.
##  5 ITH3    IT        Veneto                  4905037             84      163304.
##  6 PL21    PL        Małopolskie             3349498             79.1     40427.
##  7 NL33    NL        Zuid-Holland            3681044             81.9    163805.
##  8 FRG0    FR        Pays de la Lo…          3772852             83.3    119149.
##  9 ITI2    IT        Umbria                   884640             84.3     22482.
## 10 DE50    DE        Bremen                   681032             79.7     33708.
## # … with 3 more variables: unemployment_rate_t <dbl>,
## #   geometry <MULTIPOLYGON [m]>, pc_gdp <dbl>

Transform variables:

eu_geo %>% select_if(is.numeric) %>% st_drop_geometry() %>% correlate(method='kendall', use="na.or.complete") %>% stretch() %>% arrange(-r)
## 
## Correlation method: 'kendall'
## Missing treated using: 'na.or.complete'
## # A tibble: 25 x 3
##    x                   y                       r
##    <chr>               <chr>               <dbl>
##  1 region_population   region_gdp          0.509
##  2 region_gdp          region_population   0.509
##  3 region_gdp          pc_gdp              0.472
##  4 pc_gdp              region_gdp          0.472
##  5 life_expectancy_t   pc_gdp              0.384
##  6 pc_gdp              life_expectancy_t   0.384
##  7 life_expectancy_t   region_gdp          0.287
##  8 region_gdp          life_expectancy_t   0.287
##  9 life_expectancy_t   unemployment_rate_t 0.130
## 10 unemployment_rate_t life_expectancy_t   0.130
## # … with 15 more rows
# plot distributions
ggplot(gather(eu_geo %>% st_drop_geometry() %>% select_if(is.numeric)), 
    aes(value)) + 
    geom_histogram(bins = 20) + 
    facet_wrap(~key, scales = 'free_x')
## Warning: Removed 43 rows containing non-finite values (stat_bin).

# Log-transform the right-skewed variables; the +1 guards against
# log10(0) yielding -Inf. across() replaces four duplicated mutate lines.
eu_trans_geo <- eu_geo %>%
  mutate(across(
    c(region_population, region_gdp, pc_gdp, unemployment_rate_t),
    ~ log10(.x + 1)
  ))

# Re-plot the distributions after transformation (pivot_longer() replaces
# the superseded gather(); "key"/"value" column names are preserved).
ggplot(
  eu_trans_geo %>%
    st_drop_geometry() %>%
    select_if(is.numeric) %>%
    pivot_longer(everything(), names_to = "key", values_to = "value"),
  aes(value)
) +
  geom_histogram(bins = 20) +
  facet_wrap(~key, scales = 'free_x')
## Warning: Removed 43 rows containing non-finite values (stat_bin).

4.4.2 Calculate uniqueness

# Select variables for uniqueness. region_gdp is dropped — presumably
# because pc_gdp already carries the GDP signal (the two correlate at
# r = 0.47 above); confirm this was the intent.
eu_trans_geo_filt <- eu_trans_geo %>% select(-region_gdp)

#method = 'euclidean'
method <- 'mahalanobis'
# dist_uniq is a project helper; geometry must be dropped first so only
# the numeric indicators enter the distance computation.
eu_uniq <- dist_uniq(eu_trans_geo_filt %>% st_drop_geometry(), method)
## [1] "dist_uniq 327 7 mahalanobis"
## [1] "scaling data..."
## Warning in dist_uniq(eu_trans_geo_filt %>% st_drop_geometry(), method):
## distances could not be calculated for 22 cases. Set impute_na to TRUE to impute
## missing values.
# Round numeric columns for export; across() replaces superseded mutate_if().
openxlsx::write.xlsx(
  eu_uniq %>% mutate(across(where(is.numeric), ~ round(.x, 4))),
  paste0('analysis/eu_nuts2_uniqueness-', method, '.xlsx')
)

eu_uniq %>% head(20) # %>% View(.)
## # A tibble: 20 x 14
##    NUTS_ID CNTR_CODE NUTS_NAME               region_populatio… life_expectancy_…
##    <chr>   <chr>     <chr>                               <dbl>             <dbl>
##  1 ES63    ES        Ciudad Autónoma de Ceu…           -3.39              0.287 
##  2 ES64    ES        Ciudad Autónoma de Mel…           -3.39             -0.0341
##  3 UKI3    UK        Inner London - West               -0.216             1.57  
##  4 BE10    BE        Région de Bruxelles-Ca…           -0.183             0.287 
##  5 BG31    BG        Северозападен                     -0.748            -2.88  
##  6 TRC2    TR        Sanliurfa, Diyarbakir              1.17             -0.555 
##  7 FR10    FR        Ile-de-France                      2.62              1.49  
##  8 TRA2    TR        Agri, Kars, Igdir, Ard…           -0.277            -1.12  
##  9 EL41    EL        Βόρειο Αιγαίο                     -2.29              0.848 
## 10 TR10    TR        Istanbul                           2.87             -0.555 
## 11 ES61    ES        Andalucía                          2.17              0.527 
## 12 EL53    EL        Δυτική Μακεδονία                  -2.00              0.888 
## 13 LV00    LV        Latvija                            0.388            -2.28  
## 14 TR90    TR        Trabzon                            0.761            -0.0742
## 15 TRB2    TR        Van, Mus, Bitlis, Hakk…            0.504            -0.956 
## 16 ITC2    IT        Valle d'Aosta/Vallée d…           -2.91              0.607 
## 17 EL54    EL        Ήπειρος                           -1.73              1.13  
## 18 TRB1    TR        Malatya, Elazig, Bingö…            0.251            -0.275 
## 19 LU00    LU        Luxembourg                        -1.02              0.607 
## 20 RS22    RS        Регион Јужне и Источне…            0.0917           -2.20  
## # … with 9 more variables: unemployment_rate_t <dbl[,1]>, pc_gdp <dbl[,1]>,
## #   sum_dists <dbl>, sum_dists_dec <int>, sum_dists_z <dbl>,
## #   sum_dists_z_p <dbl>, sum_dists_class <fct>, sum_dists_p_thresh <fct>,
## #   dist_method <chr>

4.4.3 Analyse and visualise

Maps and heat maps.

# Join the uniqueness scores back onto the geometries so they can be
# mapped, and build a readable "<country> - <region>" display label.
eu_uniq_geo <- merge(eu_geo %>% select(NUTS_ID), eu_uniq, by = 'NUTS_ID')
eu_uniq_geo$name <- paste(eu_uniq_geo$CNTR_CODE, '-', eu_uniq_geo$NUTS_NAME)

# Choropleth map of the uniqueness scores (project helper).
plot_uniq_map(eu_uniq_geo)
## tmap mode set to plotting

## Variable(s) "sum_dists_z" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

# Heat map of per-region variable values (project helper) — output below
# shows scaled values plus sum_dists_z / sum_dists_z_p per region; the
# default second argument presumably selects the most-unique rows first
# (Ceuta/Melilla lead the output) — confirm against the helper's definition.
gen_variable_heatmap(eu_uniq_geo)
## # A tibble: 180 x 3
##    row_id                          col_name            value[,1]
##    <chr>                           <chr>                   <dbl>
##  1 ES - Ciudad Autónoma de Ceuta   region_population    -3.39e+0
##  2 ES - Ciudad Autónoma de Ceuta   life_expectancy_t     2.87e-1
##  3 ES - Ciudad Autónoma de Ceuta   unemployment_rate_t   2.71e+0
##  4 ES - Ciudad Autónoma de Ceuta   pc_gdp               -1.40e-1
##  5 ES - Ciudad Autónoma de Ceuta   sum_dists_z           5.69e+0
##  6 ES - Ciudad Autónoma de Ceuta   sum_dists_z_p         6.25e-9
##  7 ES - Ciudad Autónoma de Melilla region_population    -3.39e+0
##  8 ES - Ciudad Autónoma de Melilla life_expectancy_t    -3.41e-2
##  9 ES - Ciudad Autónoma de Melilla unemployment_rate_t   2.50e+0
## 10 ES - Ciudad Autónoma de Melilla pc_gdp               -2.53e-1
## # … with 170 more rows

gen_variable_heatmap(eu_uniq_geo, F)
## # A tibble: 180 x 3
##    row_id                 col_name            value[,1]
##    <chr>                  <chr>                   <dbl>
##  1 UK - South Yorkshire   region_population    -0.00487
##  2 UK - South Yorkshire   life_expectancy_t    -0.235  
##  3 UK - South Yorkshire   unemployment_rate_t  -0.247  
##  4 UK - South Yorkshire   pc_gdp                0.161  
##  5 UK - South Yorkshire   sum_dists_z          -1.20   
##  6 UK - South Yorkshire   sum_dists_z_p         0.886  
##  7 SI - Vzhodna Slovenija region_population    -0.305  
##  8 SI - Vzhodna Slovenija life_expectancy_t    -0.114  
##  9 SI - Vzhodna Slovenija unemployment_rate_t  -0.129  
## 10 SI - Vzhodna Slovenija pc_gdp               -0.278  
## # … with 170 more rows

End of notebook